output_folder <- getwd()
output_format <- ".pdf"
The code below downloads a compressed archive and uncompresses it:
download_and_extract <- function(url) {
temp_dir_path <- tempdir()
local_file_path <- file.path(temp_dir_path, basename(url))
download.file(url = url, destfile = local_file_path, quiet = TRUE)
unzipped_path <- R.utils::gunzip(local_file_path, overwrite = TRUE)
unzipped_path[1]
}
v35_data <- parse_hmp_qiime(otu_file = "http://downloads.hmpdacc.org/data/HMQCP/otu_table_psn_v35.txt.gz",
mapping_file = "http://downloads.hmpdacc.org/data/HMQCP/v35_map_uniquebyPSN.txt.bz2")
calculate_abundance <- function(data) {
mutate_taxa(data, abundance = vapply(obs(data), function(i) sum(data$obs_data[i, colnames(data$obs_data) %in% data$mapping$sample_id]), numeric(1)))
}
v35_data <- calculate_abundance(v35_data)
plot_all <- function(data, output_name, seed = 1) {
set.seed(seed)
data %>%
filter_taxa(abundance >= 100) %>%
plot(node_size = n_obs,
node_size_axis_label = "Number of OTUs",
node_color = abundance,
node_color_trans = "area",
node_color_axis_label = "Number of reads",
node_label = name,
node_label_max = 100,
overlap_avoidance = 1,
# initial_layout = "re", layout = "da",
output_file = file.path(output_folder, paste0(output_name, "--seed_", seed, output_format)))
}
plot_all(v35_data, "hmp--v35--all_data")
calculate_prop <- function(data, site) {
sample_cols <- as.character(unlist(data$mapping[data$mapping$body_site == site, "sample_id"]))
sample_cols <- sample_cols[sample_cols %in% colnames(data$obs_data)]
obs_indexes <- obs(data)
total_counts <- vapply(sample_cols, function(s) sum(data$obs_data[, s]), numeric(1))
col_name <- paste0(site, "_median_prop")
lapply(obs_indexes,
function(i) {
vapply(sample_cols,
function(s) sum(data$obs_data[i, s]) / total_counts[s],
numeric(1))})
}
calculate_prop_diff <- function(data, site_1, site_2) {
props_1 <- calculate_prop(data, site_1)
props_2 <- calculate_prop(data, site_2)
p_value <- mapply(function(x, y) wilcox.test(x, y)$p.value,
props_1, props_2)
p_value <- p.adjust(p_value, method = "bonferroni")
ifelse(p_value > 0.05 | is.nan(p_value), 0, vapply(props_1, median, numeric(1)) - vapply(props_2, median, numeric(1)))
}
plot_body_site_diff <- function(data, site_1, site_2, output_name, seed = 1) {
# calculate site difference
data$taxon_data$median_prop_diff <- calculate_prop_diff(data, site_1, site_2)
# plot
set.seed(seed)
data %>%
filter_taxa(abundance >= 100) %>%
# filter_taxa(n_supertaxa <= 4) %>%
plot(node_size_axis_label = "Number of OTUs",
node_size = n_obs,
node_color_axis_label = paste0(site_1, " only Both ", site_2, " only"),
node_color = median_prop_diff,
node_color_range = diverging_palette(),
node_color_trans = "area",
node_color_interval = max(abs(range(median_prop_diff))) * c(-1, 1),
edge_color_interval = max(abs(range(median_prop_diff))) * c(-1, 1),
node_label = name,
node_label_max = 150,
overlap_avoidance = 1,
# initial_layout = "re", layout = "da",
# maxiter = 20, fineiter = 20,
# margin_size = c(0.001, 0.001),
output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, "--seed_", seed, output_format)))
}
plot_body_site_diff(v35_data, "Throat", "Saliva", "hmp--v35")
plot_body_site_diff(v35_data, "Supragingival_plaque", "Subgingival_plaque", "hmp--v35")
plot_body_site_diff(v35_data, "Anterior_nares", "Buccal_mucosa", "hmp--v35")
plot_body_site_diff_small <- function(data, site_1, site_2, seed = 1) {
# calculate each site data
data <- calculate_body_site_abundance(data, site_1)
data <- calculate_body_site_abundance(data, site_2)
# filter out taxa without reads in either site
data <- filter_taxa(data,
data$taxon_data[[paste0(site_1, "_prop")]] != 0 | data$taxon_data[[paste0(site_2, "_prop")]] != 0,
supertaxa = TRUE)
# calculate scaled site difference
prop_1 <- data$taxon_data[[paste0(site_1, "_prop")]]
prop_2 <- data$taxon_data[[paste0(site_2, "_prop")]]
data$taxon_data$scaled_prop_diff <- (prop_1 - prop_2) / mean(c(prop_1, prop_2))
# plot
set.seed(seed)
data %>%
filter_taxa(abundance >= 100) %>%
filter_taxa(n_supertaxa <= 4) %>%
plot(node_size_axis_label = "Number of OTUs",
node_size = n_obs,
node_color_axis_label = paste0(site_1, " only Both ", site_2, " only"),
node_color = scaled_prop_diff,
node_color_range = diverging_palette(),
node_color_trans = "area",
node_color_interval = round(max(abs(range(scaled_prop_diff)))) * c(-1, 1),
edge_color_interval = round(max(abs(range(scaled_prop_diff)))) * c(-1, 1),
node_label = name,
node_label_max = 50,
overlap_avoidance = 1)
# initial_layout = "re", layout = "da",
# maxiter = 20, fineiter = 20,
# margin_size = c(0.001, 0.001),
# output_file = file.path(output_folder, paste0(output_name, "--", site_1, "_vs_", site_2, "--seed_", seed, output_format)))
}
body_sites <- c("Stool", "Saliva", "Tongue_dorsum", "Buccal_mucosa")
combinations <- t(combn(seq_along(body_sites), 2))
layout_matrix <- matrix(rep(NA, (length(body_sites))^2), nrow = length(body_sites))
for (index in 1:nrow(combinations)) {
layout_matrix[combinations[index,1], combinations[index,2]] <- index
}
bs_combinations <- t(apply(combinations, MARGIN = 1, function(x) body_sites[x]))
plots <- lapply(1:nrow(bs_combinations),
function(i) plot_body_site_diff_small(v35_data, bs_combinations[i, 1], bs_combinations[i, 2]))
do.call(gridExtra::grid.arrange, ncol = length(body_sites) - 1, nrow = length(body_sites) - 1,